### Load the train/test data (comma-separated, "." decimal)
df_class <- fread("/dados/gustavo_cruz/novo/classification_train.csv",
                  sep = ",", dec = ".")

df_class_test <- fread("/dados/gustavo_cruz/novo/classification_test.csv",
                       sep = ",", dec = ".")
### skim() summarizes the layout of the data and all of its characteristics
df_class %>% skim
── Data Summary ────────────────────────
                           Values    
Name                       Piped data
Number of rows             670       
Number of columns          3         
Key                        NULL      
_______________________              
Column type frequency:               
  numeric                  3         
________________________             
Group variables            None      

── Variable type: numeric ──────────────────────────────────────────────
  skim_variable n_missing complete_rate  mean    sd    p0    p25   p50
1 x1                    0             1 0.482 0.915 -1.94 -0.170 0.492
2 x2                    0             1 0.263 0.615 -1.31 -0.159 0.276
3 target                0             1 0.506 0.500  0     0     1    
    p75  p100 hist 
1 1.16   2.59 ▁▅▇▇▃
2 0.718  1.90 ▂▅▇▅▁
3 1      1    ▇▁▁▁▇
## Correlation table --- x1 has a positive correlation with the target
## variable, while x2 has a negative one; this will influence the
## decisions of the classification model
cor(df_class)
               x1         x2     target
x1      1.0000000 -0.3415151  0.5073421
x2     -0.3415151  1.0000000 -0.6500964
target  0.5073421 -0.6500964  1.0000000
## Check whether the classes are unbalanced via a frequency table
table(df_class$target)

  0   1 
331 339 
## Fit a logistic regression, given the binary nature of the target
## variable, and because the model performs well on datasets with a low
## number of rows and columns
model_class <- glm(target ~ x1+x2, df_class, family = "binomial")
## Model summary (coefficients, deviance, AIC)
summary(model_class)

Call:
glm(formula = target ~ x1 + x2, family = "binomial", data = df_class)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.80332  -0.49149   0.07058   0.51723   2.51267  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)   0.4625     0.1545   2.993  0.00276 ** 
x1            1.1763     0.1380   8.525  < 2e-16 ***
x2           -3.5813     0.3134 -11.427  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 928.72  on 669  degrees of freedom
Residual deviance: 475.25  on 667  degrees of freedom
AIC: 481.25

Number of Fisher Scoring iterations: 6
## Confusion matrix on the training set: table rows are the >= 0.5
## predictions, columns are the ground truth; [2:1, 2:1] reorders both
## dimensions so TRUE comes first, making TRUE the positive class
## (caret's confusionMatrix(table) treats rows as predictions)
matriz_confusao <- confusionMatrix(table(predict(model_class, type = "response") >= 0.5,
                      df_class$target == 1)[2:1, 2:1])
## Precision = TP / (TP + FP)
precision <- matriz_confusao[["byClass"]][["Precision"]]
## Recall (sensitivity) = TP / (TP + FN)
recall <- matriz_confusao[["byClass"]][["Recall"]]
## F1 score = harmonic mean of precision and recall
F1 <- matriz_confusao[["byClass"]][["F1"]]


print(matriz_confusao)
Confusion Matrix and Statistics

       
        TRUE FALSE
  TRUE   282    53
  FALSE   57   278
                                          
               Accuracy : 0.8358          
                 95% CI : (0.8056, 0.8631)
    No Information Rate : 0.506           
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.6716          
                                          
 Mcnemar's Test P-Value : 0.7748          
                                          
            Sensitivity : 0.8319          
            Specificity : 0.8399          
         Pos Pred Value : 0.8418          
         Neg Pred Value : 0.8299          
             Prevalence : 0.5060          
         Detection Rate : 0.4209          
   Detection Prevalence : 0.5000          
      Balanced Accuracy : 0.8359          
                                          
       'Positive' Class : TRUE            
                                          
## Report the metrics computed above (console output preserved below)
print(paste("Precisão:", precision))
[1] "Precisão: 0.841791044776119"
print(paste("Recall:", recall))
[1] "Recall: 0.831858407079646"
print(paste("F1:", F1))
[1] "F1: 0.836795252225519"
### ROC curve plot with its respective area under the curve
## 92.2% is an excellent value, showing the model's high predictive power
ROC <- roc(response = df_class$target, 
           predictor = model_class$fitted.values)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
ggplotly(
  ggroc(ROC, color = "#440154FF", size = 1) +
    # Diagonal reference line: the expected curve of a random classifier
    geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
                 color="grey40",
                 size = 0.2) +
    # Gini coefficient = 2 * AUC - 1, i.e. (AUC - 0.5) / 0.5
    labs(x = "Especificidade",
         y = "Sensitividade",
         title = paste("Área abaixo da curva:",
                       round(ROC$auc, 3),
                       "|",
                       "Coeficiente de Gini",
                       round((ROC$auc[1] - 0.5) / 0.5, 3))) +
    theme_bw()
)
NA
## Impact plot of the explanatory variables: point estimate with an
## error bar of +/- one standard error for each non-intercept coefficient
model_class %>%
  tidy() %>% 
  filter(term != "(Intercept)") %>%
  ggplot(aes(estimate, fct_reorder(term, estimate))) +
  # Vertical line at zero: estimates whose error bar crosses it do not
  # have a clearly determined sign
  geom_vline(xintercept = 0, color = "gray50", lty = 2, size = 1.2) +
  geom_errorbar(aes(
    xmin = estimate - std.error,
    xmax = estimate + std.error
  ),
  width = .2, color = "gray50", alpha = 0.7
  ) +
  geom_point(size = 2, color = "#85144B") +
  labs(y = NULL, x = "Variáveis Explicativas")

## Predict on the test set; cbind names the appended probability column
## "V2", which is renamed to "predict"
predicao <- cbind(df_class_test,
                  predict(model_class, df_class_test, type = "response")) %>% 
  rename(predict = V2)
## Classify with a 0.5 probability threshold
predicao$predict <- ifelse(predicao$predict >= 0.5,1,0)

## Confusion matrix of the test set
## NOTE(review): table(target, predict) puts the ground truth in the rows,
## but caret's confusionMatrix(table) treats rows as predictions — so the
## reported Sensitivity/Specificity are likely swapped, and the positive
## class comes out as "0" (see output below). Verify against
## confusionMatrix(table(predicao$predict, predicao$target), positive = "1")
confusionMatrix(table(predicao$target,predicao$predict))
Confusion Matrix and Statistics

   
      0   1
  0 137  32
  1  24 137
                                          
               Accuracy : 0.8303          
                 95% CI : (0.7854, 0.8692)
    No Information Rate : 0.5121          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.6608          
                                          
 Mcnemar's Test P-Value : 0.3496          
                                          
            Sensitivity : 0.8509          
            Specificity : 0.8107          
         Pos Pred Value : 0.8107          
         Neg Pred Value : 0.8509          
             Prevalence : 0.4879          
         Detection Rate : 0.4152          
   Detection Prevalence : 0.5121          
      Balanced Accuracy : 0.8308          
                                          
       'Positive' Class : 0               
                                          
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQojIyNDYXJyZWdhIG9zIGRhZG9zCmRmX2NsYXNzIDwtIGZyZWFkKCIvZGFkb3MvZ3VzdGF2b19jcnV6L25vdm8vY2xhc3NpZmljYXRpb25fdHJhaW4uY3N2IiwKICAgICAgICAgICAgICAgICAgc2VwID0gIiwiLCBkZWMgPSAiLiIpCgpkZl9jbGFzc190ZXN0IDwtIGZyZWFkKCIvZGFkb3MvZ3VzdGF2b19jcnV6L25vdm8vY2xhc3NpZmljYXRpb25fdGVzdC5jc3YiLAogICAgICAgICAgICAgICAgICAgICAgIHNlcCA9ICIsIiwgZGVjID0gIi4iKQojIyNGdW7Dp8OjbyBza2ltIHZlcmlmaWNhIGEgZGlzcG9zacOnw6NvIGRvcyBkYWRvcyBlIHRvZGFzIGFzIHN1YXMgY2FyYWN0ZXJpc3RpY2FzCmRmX2NsYXNzICU+JSBza2ltCmBgYAoKCmBgYHtyfQojIzN0YWJlbGEgZGUgY29ycmVsYcOnw7VlcyAtLS0gw6kgcG9zc8OtdmVsIHZlcmlmaWNhciBxdWUgeDEgdGVtIGNvcnJlbGHDp8OjbyBwb3NpdGl2YSBlbSAKIyNyZWxhw6fDo28gw6AgdmFyaWF2ZWwgdGFyZ2V0LCBhc3NpbSBjb21vIHgyIHRlbSBjb3JyZWxhw6fDo28gbmVnYXRpdmEKIyNpc3NvIHRlcsOhIGltcGFjdG8gbmEgZGVjaXNhbyBkbyBtb2RlbG8gZGUgY2xhc3NpZmljYcOnw6NvCmNvcihkZl9jbGFzcykKYGBgCgpgYGB7cn0KIyN2ZXJpZmljYW5kbyBzZSBvcyBkYWRvcyBlc3TDo28gZGVzYmFsYW5jZWFkb3MgdmlhIHRhYmVsYSBkZSBmcmVxdWVuY2lhcwp0YWJsZShkZl9jbGFzcyR0YXJnZXQpCmBgYApgYGB7cn0KIyNBcGxpY2FuZG8gdW0gbW9kZWxvIGRlIHJlZ3Jlc3PDo28gbG9naXN0aWNhLCBjb25zaWRlcmFuZG8gYSBuYXR1cmV6YSBkYSB2YXJpYXZlbCB0YXJnZXQsCiMjIGUgdGFtYsOpbSBwb3Igc2VyIHVtIG1vZGVsbyBjb20gYm9hIHBlcmZvcm1hbmNlIHBhcmEgZGF0YXNldHMgY29tIGJhaXhvCiMjbsO6bWVybyBkZSBsaW5oYXMgZSBkZSBjb2x1bmFzCm1vZGVsX2NsYXNzIDwtIGdsbSh0YXJnZXQgfiB4MSt4MiwgZGZfY2xhc3MsIGZhbWlseSA9ICJiaW5vbWlhbCIpCiMjcmVzdW1vCnN1bW1hcnkobW9kZWxfY2xhc3MpCmBgYApgYGB7cn0KIyNNYXRyaXogZGUgY29uZnVzw6NvCm1hdHJpel9jb25mdXNhbyA8LSBjb25mdXNpb25NYXRyaXgodGFibGUocHJlZGljdChtb2RlbF9jbGFzcywgdHlwZSA9ICJyZXNwb25zZSIpID49IDAuNSwKICAgICAgICAgICAgICAgICAgICAgIGRmX2NsYXNzJHRhcmdldCA9PSAxKVsyOjEsIDI6MV0pCiMjUHJlY2lzw6NvCnByZWNpc2lvbiA8LSBtYXRyaXpfY29uZnVzYW9bWyJieUNsYXNzIl1dW1siUHJlY2lzaW9uIl1dCiMjUmVjYWxsCnJlY2FsbCA8LSBtYXRyaXpfY29uZnVzYW9bWyJieUNsYXNzIl1dW1siUmVjYWxsIl1dCiMjRjEgc2NvcmUKRjEgPC0gbWF0cml6X2NvbmZ1c2FvW1siYnlDbGFzcyJdXVtbIkYxIl1dCgoKcHJpbnQobWF0cml6X2NvbmZ1c2FvKQpwcmludChwYXN0ZSgiUHJlY2lzw6NvOiIsIHByZWNpc2lvbikp
CnByaW50KHBhc3RlKCJSZWNhbGw6IiwgcmVjYWxsKSkKcHJpbnQocGFzdGUoIkYxOiIsIEYxKSkKYGBgCgpgYGB7cn0KIyMjR3LDoWZpY28gZGEgY3VydmEgUm9jIGUgc3VhIHJlc3BlY3RpdmEgw6FyZWEgZW1iYWl4byBkYSBjdXJ2YQojIzkyLDIlIMOpIHVtIGV4Y2VsZW50ZSB2YWxvciwgbW9zdHJhbmRvIGEgYWx0YSBjYXBhY2lkYWRlIGRlIHByZWRpw6fDo28gZG8gbW9kZWxvClJPQyA8LSByb2MocmVzcG9uc2UgPSBkZl9jbGFzcyR0YXJnZXQsIAogICAgICAgICAgIHByZWRpY3RvciA9IG1vZGVsX2NsYXNzJGZpdHRlZC52YWx1ZXMpCgpnZ3Bsb3RseSgKICBnZ3JvYyhST0MsIGNvbG9yID0gIiM0NDAxNTRGRiIsIHNpemUgPSAxKSArCiAgICBnZW9tX3NlZ21lbnQoYWVzKHggPSAxLCB4ZW5kID0gMCwgeSA9IDAsIHllbmQgPSAxKSwKICAgICAgICAgICAgICAgICBjb2xvcj0iZ3JleTQwIiwKICAgICAgICAgICAgICAgICBzaXplID0gMC4yKSArCiAgICBsYWJzKHggPSAiRXNwZWNpZmljaWRhZGUiLAogICAgICAgICB5ID0gIlNlbnNpdGl2aWRhZGUiLAogICAgICAgICB0aXRsZSA9IHBhc3RlKCLDgXJlYSBhYmFpeG8gZGEgY3VydmE6IiwKICAgICAgICAgICAgICAgICAgICAgICByb3VuZChST0MkYXVjLCAzKSwKICAgICAgICAgICAgICAgICAgICAgICAifCIsCiAgICAgICAgICAgICAgICAgICAgICAgIkNvZWZpY2llbnRlIGRlIEdpbmkiLAogICAgICAgICAgICAgICAgICAgICAgIHJvdW5kKChST0MkYXVjWzFdIC0gMC41KSAvIDAuNSwgMykpKSArCiAgICB0aGVtZV9idygpCikKCmBgYApgYGB7cn0KIyNHcsOhZmljbyBkZSBpbXBhY3RvIGRhcyB2YXJpYXZlaXMgZXhwbGljYXRpdmFzCm1vZGVsX2NsYXNzICU+JQogIHRpZHkoKSAlPiUgCiAgZmlsdGVyKHRlcm0gIT0gIihJbnRlcmNlcHQpIikgJT4lCiAgZ2dwbG90KGFlcyhlc3RpbWF0ZSwgZmN0X3Jlb3JkZXIodGVybSwgZXN0aW1hdGUpKSkgKwogIGdlb21fdmxpbmUoeGludGVyY2VwdCA9IDAsIGNvbG9yID0gImdyYXk1MCIsIGx0eSA9IDIsIHNpemUgPSAxLjIpICsKICBnZW9tX2Vycm9yYmFyKGFlcygKICAgIHhtaW4gPSBlc3RpbWF0ZSAtIHN0ZC5lcnJvciwKICAgIHhtYXggPSBlc3RpbWF0ZSArIHN0ZC5lcnJvcgogICksCiAgd2lkdGggPSAuMiwgY29sb3IgPSAiZ3JheTUwIiwgYWxwaGEgPSAwLjcKICApICsKICBnZW9tX3BvaW50KHNpemUgPSAyLCBjb2xvciA9ICIjODUxNDRCIikgKwogIGxhYnMoeSA9IE5VTEwsIHggPSAiVmFyacOhdmVpcyBFeHBsaWNhdGl2YXMiKQoKYGBgCmBgYHtyfQojI0ZhemVuZG8gYSBwcmVkacOnw6NvIG5vIGNvbmp1bnRvIGRlIHRlc3RlCnByZWRpY2FvIDwtIGNiaW5kKGRmX2NsYXNzX3Rlc3QsCiAgICAgICAgICAgICAgICAgIHByZWRpY3QobW9kZWxfY2xhc3MsIGRmX2NsYXNzX3Rlc3QsIHR5cGUgPSAicmVzcG9uc2UiKSkgJT4lIAogIHJlbmFtZShwcmVkaWN0ID0gVjIpCnByZWRpY2FvJHByZWRpY3QgPC0gaWZlbHNlKHByZWRpY2Fv
JHByZWRpY3QgPj0gMC41LDEsMCkKCiMjTWF0cml6IGRlIGNvbmZ1c8OjbyBkIGNvbmp1bnRvIGRlIHRlc3RlCmNvbmZ1c2lvbk1hdHJpeCh0YWJsZShwcmVkaWNhbyR0YXJnZXQscHJlZGljYW8kcHJlZGljdCkpCgpgYGAKCg==